In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, json, time
from matplotlib_venn import venn2, venn3
from scipy import stats
from utility_functions import *
DATE
Out[1]:
'20250627'

Sankey plot¶

In [2]:
from pySankey.sankey import sankey as sankeyplot
import plotly.graph_objects as go
In [3]:
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
filtering = 'hybrid'
data = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
    data.append(pd.read_csv(os.path.join(working_folder, dataset_name,
                                         f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}-outerjoin.csv.gz')))
for _ in data:
    print(_.shape)
data = pd.concat(data, ignore_index=True)
print(data.shape)

F, counts = make_sankey_plot_with_counts(data, suffixes=['_closedsearch','_opensearch'])
F.write_image(f"publication-data/{DATE}-Sankey-closedsearch-opensearch-{filtering}-filtering.svg")
F.show()
(126204, 63)
(45786, 63)
(163958, 63)
(335948, 63)
In [4]:
# searches overall overlap
closed_search_identified_spectra = data[~data.database_closedsearch.isna()].spectrum_title
open_search_identified_spectra   = data[~data.database_opensearch.isna()].spectrum_title
open_search_identified_spectra
venn2([set(closed_search_identified_spectra),set(open_search_identified_spectra)], 
      set_labels=['Closed search','Open search'],
      set_colors=sns.color_palette("Set2")[:2])
plt.title('Identified spectra overlap (all datasets)')
plt.savefig(f"publication-data/{DATE}-overall-overlap-closedsearch-opensearch-{filtering}-filtering.svg")
No description has been provided for this image
In [5]:
len(set(open_search_identified_spectra))/len(set(closed_search_identified_spectra))
Out[5]:
1.583165923082675

In [6]:
# F = make_sankey_plot_with_counts(data, sankey_labels)
# F.write_image("publication-data/{DATE}-Sankey-plot-closedsearch-opensearch.svg")
# F.show()
In [7]:
data3 = counts.loc[['Canonical+Unmodified/Expected','NonCanonical+Unmodified/Expected',
                   'Decoy','Unidentified'],
                ['Canonical+Unmodified/Expected','Canonical+Unexpected',
                 'NonCanonical+Unmodified/Expected','NonCanonical+Unexpected',
                 'Decoy','Unidentified']]
data3.style.background_gradient()
Out[7]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch            
Canonical+Unmodified/Expected 161808 17531 248 738 432 17285
NonCanonical+Unmodified/Expected 119 320 393 92 0 214
Decoy 182 302 3 3 77 858
Unidentified 68154 62859 1209 1646 1475 0

In [8]:
# All spectra
tmp = data3.iloc[:,:]
print(tmp.sum().sum())
tmp
335948
Out[8]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 248 738 432 17285
NonCanonical+Unmodified/Expected 119 320 393 92 0 214
Decoy 182 302 3 3 77 858
Unidentified 68154 62859 1209 1646 1475 0
In [9]:
# Any --> Any
tmp = data3.iloc[:-1,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
182248
54.2%
Out[9]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 248 738 432
NonCanonical+Unmodified/Expected 119 320 393 92 0
Decoy 182 302 3 3 77
In [10]:
# Any --> Any
tmp = data3.iloc[:,:-1]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
317591
94.5%
Out[10]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 248 738 432
NonCanonical+Unmodified/Expected 119 320 393 92 0
Decoy 182 302 3 3 77
Unidentified 68154 62859 1209 1646 1475
In [11]:
# Identified by closed search
tmp = data3.iloc[:-1,:]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
200605
59.7%
Out[11]:
sankey_opensearch Canonical+Unmodified/Expected Canonical+Unexpected NonCanonical+Unmodified/Expected NonCanonical+Unexpected Decoy Unidentified
sankey_closedsearch
Canonical+Unmodified/Expected 161808 17531 248 738 432 17285
NonCanonical+Unmodified/Expected 119 320 393 92 0 214
Decoy 182 302 3 3 77 858
In [12]:
# Expected --> Expected ptm
tmp = data3.iloc[:2,[0,2]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
162568
48.4%
Out[12]:
sankey_opensearch Canonical+Unmodified/Expected NonCanonical+Unmodified/Expected
sankey_closedsearch
Canonical+Unmodified/Expected 161808 248
NonCanonical+Unmodified/Expected 119 393
In [13]:
# Expected --> Unexpected ptm
tmp = data3.iloc[:2,[1,3]]
print(tmp.sum().sum())
print(f"{tmp.sum().sum() / data3.sum().sum():.1%}")
tmp
18681
5.6%
Out[13]:
sankey_opensearch Canonical+Unexpected NonCanonical+Unexpected
sankey_closedsearch
Canonical+Unmodified/Expected 17531 738
NonCanonical+Unmodified/Expected 320 92
In [14]:
# Unidentified --> Any ID
tmp = data3.iloc[-1,:-2]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
133868
Out[14]:
sankey_opensearch
Canonical+Unmodified/Expected       0.51
Canonical+Unexpected                0.47
NonCanonical+Unmodified/Expected    0.01
NonCanonical+Unexpected             0.01
Name: Unidentified, dtype: float64
In [15]:
# NonCanon --> Any
tmp = data3.iloc[1,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),2)
1138
Out[15]:
sankey_opensearch
Canonical+Unmodified/Expected       0.10
Canonical+Unexpected                0.28
NonCanonical+Unmodified/Expected    0.35
NonCanonical+Unexpected             0.08
Decoy                               0.00
Unidentified                        0.19
Name: NonCanonical+Unmodified/Expected, dtype: float64
In [16]:
# Canon --> Any
tmp = data3.iloc[0,:]
print(tmp.sum().sum())
round(tmp / tmp.sum(),3)
198042
Out[16]:
sankey_opensearch
Canonical+Unmodified/Expected       0.817
Canonical+Unexpected                0.089
NonCanonical+Unmodified/Expected    0.001
NonCanonical+Unexpected             0.004
Decoy                               0.002
Unidentified                        0.087
Name: Canonical+Unmodified/Expected, dtype: float64

Compare by-correlation of spectra identified in both searches (closed & open)¶

In [17]:
combo = []
for dataset_name in ["PXD014258.v0.11.4","PXD002057.v0.11.4","PXD005833.v0.11.4"]:
    tmp = pd.read_csv(os.path.join(working_folder, dataset_name,
                                   f'{dataset_name[:9]}-opensearch-x-closedsearch-filt-{filtering}.csv.gz'))
    print(tmp.shape)
    combo.append(tmp)
    del tmp
combo = pd.concat(combo, ignore_index=True)
print(combo.shape)
combo.tail()
(81739, 63)
(19051, 63)
(81458, 63)
(182248, 63)
Out[17]:
spectrum_title scan spectrum_file precursor_mass_closedsearch database_peptide_closedsearch matched_peptide_closedsearch modifications_closedsearch database_closedsearch psm_score_closedsearch global_q_closedsearch ... by-explained_opensearch b-explained_opensearch y-explained_opensearch all-explained_opensearch by-intensity-pattern-correlation_opensearch top_tag_rank_nterm_opensearch top_tag_rank_cterm_opensearch top_tag_rank_opensearch predicted_retention_time_opensearch retention_time_error_adjusted_opensearch
182243 AM9:controllerType=0 controllerNumber=1 scan=1... 13279 AM9 830.484472 STEILLR STEILLR Unmodified T 1.092700 0.000000 ... 0.2310 276 2034 0.3345 0.9570 101 0 0 3021.414430 638.892370
182244 AM9:controllerType=0 controllerNumber=1 scan=2... 23425 AM9 2372.138925 KDLYTNTVLSGGTTMYPGIADR KDLYTNTVLSGGTTMYPGIADR Unmodified T 1.076710 0.000000 ... 0.3167 683 2484 0.3896 0.9373 9 0 0 3821.265109 168.695269
182245 AM9:controllerType=0 controllerNumber=1 scan=1... 11336 AM9 869.532324 QLLVGVNK QLLVGVNK Unmodified T 1.071870 0.000000 ... 0.1264 129 1135 0.2876 0.8116 0 0 0 2379.508031 244.233851
182246 AM9:controllerType=0 controllerNumber=1 scan=7683 7683 AM9 850.525976 GIPAPAVVK GIPAPAVVK Unmodified T 1.026570 0.000000 ... 0.0852 333 519 0.1580 0.7169 2 0 0 1892.113293 232.152333
182247 AM9:controllerType=0 controllerNumber=1 scan=2... 26676 AM9 2244.041710 DLYTNTVLSGGTTMYPGIADR DLYTNTVLSGGTTMYPGIADR Unmodified T 0.796872 0.000189 ... 0.1426 193 1233 0.1668 0.9442 4 3 3 4254.157192 167.985052

5 rows × 63 columns

In [18]:
combo["Same_peptide"]=combo.matched_peptide_closedsearch==combo.matched_peptide_opensearch
combo["Same_mod_peptide"]=combo.modified_peptide_closedsearch==combo.modified_peptide_opensearch
combo["Same_mods_noRT"]=(combo.matched_peptide_closedsearch+combo.modifications_noRT_closedsearch)==(combo.matched_peptide_opensearch+combo.modifications_noRT_opensearch)

def samepep_and_samemod(row):
    if row.Same_peptide and row.Same_mods_noRT:
        return 'Same_peptidoform'
    elif row.Same_peptide and not row.Same_mods_noRT:
        return 'Positional_isoform'
    elif not row.Same_peptide and not row.Same_mods_noRT:
        return 'Different_peptide'
combo['samepep_samemod'] = combo.apply(samepep_and_samemod, axis=1)
# print(combo['samepep_samemod'].value_counts())

VAR, VAL = 'samepep_samemod','by-intensity-pattern-correlation'
for i,df in combo.groupby('samepep_samemod').__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

ORDER = ['Same_peptidoform','Positional_isoform','Different_peptide']
tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','samepep_samemod'], 
                           value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                           var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x='samepep_samemod', y='value', hue='search', aspect=1.25, showfliers=False, notch=True, 
            palette='Set2', order=ORDER)
plt.title('Intensity correlation across searches')
plt.ylabel('by-intensity-pattern-correlation')
plt.xlabel(None)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-1-{filtering}.svg")
Different_peptide: p=2.5e-102 True (n=13476) effect size = 0.218
Positional_isoform: p=4.9e-02 True (n=14084) effect size = 0.032
Same_peptidoform: p=9.4e-01 False (n=154688) effect size = 0.000
No description has been provided for this image
In [19]:
combo['target_decoy'] = combo.apply(lambda row: f'{row.database_closedsearch}->{row.database_opensearch}', axis=1)
combo.target_decoy.value_counts()

VAR, VAL = 'target_decoy','by-intensity-pattern-correlation'
for i,df in combo.groupby(VAR).__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file','target_decoy'], 
                           value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                           var_name='search', value_name='value')

sns.catplot(data=tmp, kind='box',   x='target_decoy', y='value', hue='search', 
            aspect=1.25, showfliers=False, notch=True, palette='Set2', order=['T->T','D->D','D->T','T->D'])
plt.title('Intensity correlation across searches')
plt.xlabel('Target or Decoy hit (closed->open search)')
plt.ylabel('by-intensity-pattern-correlation')
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-2-{filtering}.svg")
D->D: p=2.4e-02 True (n=77) effect size = 0.305
D->T: p=2.8e-26 True (n=490) effect size = 0.683
T->D: p=6.3e-02 False (n=432) effect size = 0.027
T->T: p=9.5e-08 True (n=181249) effect size = 0.016
No description has been provided for this image
In [20]:
combo['canon_before_after'] = combo.apply(lambda row: f'{row.isCanonical_closedsearch}->{row.isCanonical_opensearch}', axis=1)
# print(combo.canon_before_after.value_counts(sort=False))
ORDER = [
    'Canonical->Canonical',
    'NonCanonical->NonCanonical',
    'Canonical->NonCanonical',
    'NonCanonical->Canonical',
    
]

VAR, VAL =  'canon_before_after', 'by-intensity-pattern-correlation',

for i,df in combo.groupby(VAR).__iter__():
    _,pval = stats.mannwhitneyu(df[f'{VAL}_closedsearch'],
                                df[f'{VAL}_opensearch'])
    effect_size = cohenD(df[f'{VAL}_closedsearch'],df[f'{VAL}_opensearch'])
    print(f'{i}: p={pval:.1e} {pval<.05} (n={len(df)}) effect size = {effect_size:.3f}')

tmp = combo.melt(id_vars=['spectrum_title','scan','spectrum_file',VAR], 
                       value_vars=[f'{VAL}_closedsearch',f'{VAL}_opensearch'], 
                       var_name='search', value_name='value')
sns.catplot(data=tmp, kind='box', x=VAR, y='value', hue='search', showfliers=False,
            notch=True, palette='Set2', order=ORDER
            # aspect=2
           )
plt.xlabel('Canonical protein hit (closed->open search)')
plt.xticks(rotation=90)
plt.ylabel(VAL)
plt.savefig(f"publication-data/by-ions-closedsearch-opensearch-3-{filtering}.svg")
Canonical->Canonical: p=2.6e-07 True (n=180326) effect size = 0.015
Canonical->NonCanonical: p=1.0e-03 True (n=997) effect size = 0.162
NonCanonical->Canonical: p=2.6e-35 True (n=440) effect size = 0.891
NonCanonical->NonCanonical: p=3.5e-01 False (n=485) effect size = 0.091
No description has been provided for this image

save¶

In [ ]:
# Auto save & export notebook to html!!
autosave(extra_labels='-'+filtering)
filtering